# Used to display all the matplotlib graphs inside the notebook
# (IPython magic — only valid when executed as a notebook cell, not as plain Python).
%matplotlib inline
# Hiding the warnings
# NOTE(review): a blanket 'ignore' also hides pandas chained-assignment and
# sklearn data-conversion warnings raised later in this notebook — consider
# narrowing the filter.
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # Setting the default seaborn style for all the plots
# Load the supplied vehicle-silhouette measurements into a DataFrame.
data = pd.read_csv('vehicle-1.csv')
data.shape   # dimensions of the raw data (rows, columns)
data.head()  # first few records for a quick sanity check
# Drop exact duplicate rows, reporting the shape before and after.
print('The shape before removing the duplicates from the data set is :', data.shape)
data = data.drop_duplicates()
print('The shape after removing the duplicates from the data set is :', data.shape)
data.info()     # dtypes and non-null counts per column
data.nunique()  # distinct values per column (helps spot categorical variables)
data.isnull().sum()  # missing-value count per column
# Replace the null values in each column using linear interpolation.
# (The notebook export had flattened the loop body's indentation — restored here;
#  the intermediate `columns` list was unused elsewhere and is inlined.)
for column in data.columns:
    # Only interpolate columns that actually contain missing values.
    if data[column].count() != data.shape[0]:
        data[column] = data[column].interpolate(method="linear")
data.isnull().sum()  # verify that no nulls remain
# Pairwise scatter/marginal plots of all numeric features.
sns.pairplot(data)
data.describe().T  # transposed summary statistics
# Correlation heatmap over every pair of numeric variables.
correlation_matrix = data.corr()
figure, axes = plt.subplots(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, ax=axes, cmap='Pastel2')
# Distribution of records across the three target classes (van / car / bus).
sns.countplot(data['class'], edgecolor='black', alpha=0.7)
class_counts = data.groupby('class').size()
print(class_counts)
# Predictor matrix and the list of numeric feature names reused below.
X = data.drop('class', axis=1)
c_cols = X.columns.to_list()
# Distribution plot of every numeric predictor, one figure at a time.
# (Loop body indentation restored from the flattened notebook export.)
for column in c_cols:
    plt.close()  # close the previous figure before drawing the next one
    # NOTE(review): sns.distplot is deprecated in newer seaborn (use histplot/displot);
    # kept as-is to preserve the original output.
    sns.distplot(data[column])
    plt.show()
# Checking for the skewness of the predictors.
# (Loop indentation restored; the index loop is replaced by a comprehension
#  plus zip — the printed output is byte-identical.)
from scipy import stats

skews = [stats.skew(data[col]) for col in c_cols]
for col, skew_value in zip(c_cols, skews):
    print('The skewness of ', col, 'is: ', skew_value)
# Using boxplots grouped by class to find the outliers (bivariate analysis).
# (Loop body indentation restored from the flattened notebook export.)
for i in c_cols:
    data.boxplot(i, by='class')
    plt.tight_layout()
    plt.title('', y=1.08)  # blank out the automatic per-plot title
    plt.xlabel('Class')
    plt.ylabel(i)
# ---- Outlier treatment (guided by the boxplots above) ---------------------
# Strategy: for each (column, class) combination with visible outliers, find
# the largest value still inside the plausible range and clip the outliers
# down to it.  Every assignment uses .loc instead of the original chained
# indexing (data[col][mask] = value): chained assignment can silently write
# to a copy (the SettingWithCopyWarning is hidden because warnings are
# suppressed above) and fails outright under pandas copy-on-write.
# Thresholds and masks are unchanged; the bare `value` lines are kept for
# notebook display.

# radius_ratio: first cap every value above 275 (any class) at the largest
# van value below 275 ...
value = data[(data['radius_ratio'] < 275) & (data['class'] == 'van')]['radius_ratio'].max()
value
data.loc[data['radius_ratio'] > 275, 'radius_ratio'] = value
# ... then tighten the van records above 200.
value = data[(data['radius_ratio'] < 200) & (data['class'] == 'van')]['radius_ratio'].max()
value
data.loc[(data['radius_ratio'] > 200) & (data['class'] == 'van'), 'radius_ratio'] = value
# pr.axis_aspect_ratio: cap bus and van records at each class's maximum below 80.
value = data[(data['class'] == 'bus') & (data['pr.axis_aspect_ratio'] < 80)]['pr.axis_aspect_ratio'].max()
value
data.loc[(data['pr.axis_aspect_ratio'] >= value) & (data['class'] == 'bus'), 'pr.axis_aspect_ratio'] = value
value = data[(data['class'] == 'van') & (data['pr.axis_aspect_ratio'] < 80)]['pr.axis_aspect_ratio'].max()
value
data.loc[(data['pr.axis_aspect_ratio'] >= value) & (data['class'] == 'van'), 'pr.axis_aspect_ratio'] = value
# max.length_aspect_ratio: bus capped below 15, van below 20.
value = data[(data['class'] == 'bus') & (data['max.length_aspect_ratio'] < 15)]['max.length_aspect_ratio'].max()
value
data.loc[(data['max.length_aspect_ratio'] >= value) & (data['class'] == 'bus'), 'max.length_aspect_ratio'] = value
value = data[(data['class'] == 'van') & (data['max.length_aspect_ratio'] < 20)]['max.length_aspect_ratio'].max()
value
data.loc[(data['max.length_aspect_ratio'] > value) & (data['class'] == 'van'), 'max.length_aspect_ratio'] = value
# scaled_variance: two successive tightening passes on the van records.
value = data[(data['class'] == 'van') & (data['scaled_variance'] <= 225)]['scaled_variance'].max()
value
data.loc[(data['scaled_variance'] > value) & (data['class'] == 'van'), 'scaled_variance'] = value
value = data[(data['class'] == 'van') & (data['scaled_variance'] < 200)]['scaled_variance'].max()
value
data.loc[(data['scaled_variance'] > value) & (data['class'] == 'van'), 'scaled_variance'] = value
# scaled_radius_of_gyration.1: cap bus below 110; van below 110, then below 95.
value = data[(data['class'] == 'bus') & (data['scaled_radius_of_gyration.1'] < 110)]['scaled_radius_of_gyration.1'].max()
value
data.loc[(data['scaled_radius_of_gyration.1'] > value) & (data['class'] == 'bus'), 'scaled_radius_of_gyration.1'] = value
value = data[(data['class'] == 'van') & (data['scaled_radius_of_gyration.1'] < 110)]['scaled_radius_of_gyration.1'].max()
value
data.loc[(data['scaled_radius_of_gyration.1'] > value) & (data['class'] == 'van'), 'scaled_radius_of_gyration.1'] = value
value = data[(data['class'] == 'van') & (data['scaled_radius_of_gyration.1'] < 95)]['scaled_radius_of_gyration.1'].max()
value
data.loc[(data['scaled_radius_of_gyration.1'] > value) & (data['class'] == 'van'), 'scaled_radius_of_gyration.1'] = value
# skewness_about: cap bus below 15; van via a two-step search below 20.
value = data[(data['class'] == 'bus') & (data['skewness_about'] < 15)]['skewness_about'].max()
value
data.loc[(data['skewness_about'] > value) & (data['class'] == 'bus'), 'skewness_about'] = value
value1 = data[(data['class'] == 'van') & (data['skewness_about'] < 20)]['skewness_about'].max()
value1
value = data[(data['class'] == 'van') & (data['skewness_about'] < value1)]['skewness_about'].max()
value
data.loc[(data['skewness_about'] > value) & (data['class'] == 'van'), 'skewness_about'] = value
# skewness_about.1: van records, two-step search below 30.
value1 = data[(data['class'] == 'van') & (data['skewness_about.1'] < 30)]['skewness_about.1'].max()
value1
value = data[(data['class'] == 'van') & (data['skewness_about.1'] < value1)]['skewness_about.1'].max()
value
data.loc[(data['skewness_about.1'] > value) & (data['class'] == 'van'), 'skewness_about.1'] = value
# skewness_about.2: cap car records below 205.
value = data[(data['class'] == 'car') & (data['skewness_about.2'] < 205)]['skewness_about.2'].max()
value
data.loc[(data['skewness_about.2'] > value) & (data['class'] == 'car'), 'skewness_about.2'] = value
# pr.axis_rectangularity: cap bus at <= 24, van below 23.
value = data[(data['class'] == 'bus') & (data['pr.axis_rectangularity'] <= 24)]['pr.axis_rectangularity'].max()
value
data.loc[(data['pr.axis_rectangularity'] > value) & (data['class'] == 'bus'), 'pr.axis_rectangularity'] = value
value = data[(data['class'] == 'van') & (data['pr.axis_rectangularity'] < 23)]['pr.axis_rectangularity'].max()
value
data.loc[(data['pr.axis_rectangularity'] > value) & (data['class'] == 'van'), 'pr.axis_rectangularity'] = value
# Re-draw the per-class boxplots to confirm the outlier treatment worked.
# (Loop body indentation restored from the flattened notebook export.)
for i in c_cols:
    data.boxplot(i, by='class')
    plt.tight_layout()
    plt.title('', y=1.08)  # blank out the automatic per-plot title
    plt.xlabel('Class')
    plt.ylabel(i)
# One-way ANOVA of every predictor against the class label.
# H0 : There is NO relation between the given variables (p > 0.05)
# H1 : There is a relation between the given variables (p <= 0.05)
# (The two hypothesis statements above were raw markdown in the export — a
#  syntax error in plain Python — and the loop bodies had lost their
#  indentation; both are repaired here.)
from scipy.stats import f_oneway

anova = []
for i in c_cols:
    groups = data.groupby('class')[i].apply(list)  # one value-list per class
    res = f_oneway(*groups)
    anova.append(i + ': ' + str(res[1]))  # res[1] is the p-value
for i in anova:
    print(i)
# Persist the cleaned frame to disk and reload it, simulating a fresh start.
data.to_pickle('object_detection_data.pickle')
detection_data = pd.read_pickle('object_detection_data.pickle')
detection_data.head()
detection_data.isnull().sum()  # confirm the reloaded frame still has no nulls
# Split the reloaded frame into predictors (x) and the target column vector (y).
x = detection_data.drop('class', axis=1).values
y = detection_data['class'].values.reshape(-1, 1)  # (n, 1) column of labels
print('The shape of X is: ', x.shape)
print('The shape of y is: ', y.shape)
from scipy.stats import zscore

# Standardise every predictor to zero mean / unit variance.
X = data.drop('class', axis=1)
X_scaled = X.apply(zscore)
X = X_scaled.values  # work with the scaled numpy array from here on
print('The shape of X is: ', X.shape)
print('The shape of y is: ', y.shape)
X_scaled.head()
from sklearn.model_selection import train_test_split

# Hold out 30% of the records for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=14)
for split_name, split_array in (('X_train', X_train), ('y_train', y_train),
                                ('X_test', X_test), ('y_test', y_test)):
    # Two spaces before the shape tuple match the original comma-print output.
    print(f'The shape of {split_name} is:  {split_array.shape}')
from sklearn import svm

# RBF-kernel SVM with the hand-tuned hyper-parameters from the notebook.
model_svm = svm.SVC(C=20, kernel='rbf', gamma=0.01)
print(model_svm)
# .ravel() hands sklearn a 1-D label vector (y is (n, 1)), avoiding the
# DataConversionWarning that the global warning filter was hiding.
model_SVM = model_svm.fit(X_train, y_train.ravel())
y_pred = model_SVM.predict(X_test)
from sklearn import metrics

# Confusion matrix with a fixed label order so the rows/columns line up with
# the "Predict ..." column names below.
cm_svm = metrics.confusion_matrix(y_test, y_pred, labels=['bus', 'car', 'van'])
df_cm_svm = pd.DataFrame(cm_svm, index=["bus", "car", "van"],
                         columns=["Predict bus", "Predict car", "Predict van"])
plt.figure(figsize=(9, 7))
sns.heatmap(df_cm_svm, annot=True, fmt='g', cmap='afmhot')
print(metrics.classification_report(y_test, y_pred))
# Compute the accuracy directly instead of scraping it out of the formatted
# classification_report string (the original .split()[-2] parse was fragile
# and breaks if the report layout changes).
accuracy_percentage_svm = metrics.accuracy_score(y_test, y_pred) * 100
print('The Accuracy of the SVM model is :', accuracy_percentage_svm, '%')
score_svm = np.round((model_svm.score(X_test, y_test) * 100), 2)
print('The score of the SVM model on the test data is: ', score_svm, '%')
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# 10-fold cross-validation with shuffling; fixed seed for reproducibility.
cv = KFold(n_splits=10, random_state=19, shuffle=True)
# np.ravel(y) passes a 1-D label vector (avoids the hidden DataConversionWarning).
k_fold_accuracy_scores = cross_val_score(model_svm, X, np.ravel(y), cv=cv)
k_fold_accuracy_score_svm = round((k_fold_accuracy_scores * 100).mean(), 2)
print('\nThe final average accuracy of the SVM model using K-fold-cross validation is: ', k_fold_accuracy_score_svm, '%')
from sklearn.decomposition import PCA

# Fit a PCA with one component per predictor so the full variance spectrum
# can be inspected before choosing how many components to keep.
pca = PCA(n_components=X.shape[1])
pca_fit = pca.fit(X)
reduced_X = pca_fit.transform(X)  # scores of every record on all components
# Every original column is now represented by a principal component.
print(np.round(reduced_X[0:3], 2))
# Fraction of the total variance carried by each principal component.
var_explained = pca.explained_variance_ratio_
print(np.round(var_explained, 2))
# Cumulative variance explained, in percent.
var_explained_cumulative = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
print(var_explained_cumulative)
# Scree-style plot: where the curve flattens, extra components stop paying off.
# The x-range is derived from the component count instead of the original
# hard-coded range(1, 19), which silently assumed exactly 18 predictors.
plt.plot(range(1, len(var_explained_cumulative) + 1), var_explained_cumulative)
plt.xlabel('Number of components')
plt.ylabel('% Variance explained')
# Choosing 6-principal components based on above graph
pca = PCA(n_components=6)
# fitting the data
pca_fit=pca.fit(X)
# calculating the principal components
reduced_X = pca_fit.transform(X)
# 18 Columns present in X are now represented by 6-Principal components present in reduced_X
print(np.round(reduced_X[0:10],2))
sns.pairplot(pd.DataFrame(reduced_X))
PCA_detection_data=pd.DataFrame(reduced_X, columns=['PC1','PC2','PC3','PC4','PC5','PC6'])
PCA_detection_data['Class']=y
PCA_detection_data.head()
# Predictor / target split on the PCA-reduced data.
X_PCA = PCA_detection_data.drop('Class', axis=1)
y_PCA = PCA_detection_data[['Class']].values
# Reshape using y_PCA's own length — the original used the unrelated global
# y, which only worked because the two lengths happened to match.
y_PCA = y_PCA.reshape(len(y_PCA), 1)
print('The shape of X_PCA is: ', X_PCA.shape)
print('The shape of y_PCA is: ', y_PCA.shape)
X_PCA.head()
# Same 70/30 split and seed as the raw-data model, for a fair comparison.
X_train_PCA, X_test_PCA, y_train_PCA, y_test_PCA = train_test_split(X_PCA, y_PCA, test_size=0.3, random_state=14)
print('The shape of X_train_PCA is: ', X_train_PCA.shape)
print('The shape of y_train_PCA is: ', y_train_PCA.shape)
print('The shape of X_test_PCA is: ', X_test_PCA.shape)
print('The shape of y_test_PCA is: ', y_test_PCA.shape)
from sklearn import svm

# Same SVM hyper-parameters as the raw-data model, for a fair comparison.
model_svm_pca = svm.SVC(C=20, kernel='rbf', gamma=0.01)
print(model_svm_pca)
# .ravel() hands sklearn a 1-D label vector (avoids DataConversionWarning).
model_SVM_PCA = model_svm_pca.fit(X_train_PCA, y_train_PCA.ravel())
y_pred_PCA = model_SVM_PCA.predict(X_test_PCA)
from sklearn import metrics

# Confusion matrix with a fixed label order matching the column names below.
cm_svm_pca = metrics.confusion_matrix(y_test_PCA, y_pred_PCA, labels=['bus', 'car', 'van'])
df_cm_svm_pca = pd.DataFrame(cm_svm_pca, index=["bus", "car", "van"],
                             columns=["Predict bus", "Predict car", "Predict van"])
plt.figure(figsize=(9, 7))
sns.heatmap(df_cm_svm_pca, annot=True, fmt='g', cmap='afmhot')
print(metrics.classification_report(y_test_PCA, y_pred_PCA))
# Accuracy computed directly rather than parsed out of the report string
# (the original .split()[-2] scraping was fragile).
accuracy_percentage_svm = metrics.accuracy_score(y_test_PCA, y_pred_PCA) * 100
print('The Accuracy of the SVM model using PCA is :', accuracy_percentage_svm, '%')
score_svm_pca = np.round((model_svm_pca.score(X_test_PCA, y_test_PCA) * 100), 2)
print('The score of the SVM model on the test data is: ', score_svm_pca, '%')
from sklearn.model_selection import cross_val_score

# 10-fold CV on the PCA features, same fold seed as the raw-data run so the
# two cross-validation scores are directly comparable.
cv = KFold(n_splits=10, random_state=19, shuffle=True)
# np.ravel(y_PCA) passes 1-D labels (avoids the hidden DataConversionWarning).
k_fold_accuracy_scores = cross_val_score(model_svm_pca, X_PCA, np.ravel(y_PCA), cv=cv)
k_fold_accuracy_score_svm_pca = round((k_fold_accuracy_scores * 100).mean(), 2)
print('\nThe final average accuracy of the SVM model using PCA and K-fold-cross validation is: ', k_fold_accuracy_score_svm_pca, '%')
# Side-by-side comparison of the two models' test and cross-validation scores.
Accuracy_Scores = pd.DataFrame(
    {
        'Accuracy Score (%)': [score_svm, score_svm_pca],
        'Cross Validation Score (%)': [k_fold_accuracy_score_svm, k_fold_accuracy_score_svm_pca],
    },
    index=['SVM-Raw Data', 'SVM with PCA'],
)
Accuracy_Scores